{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "d28595f6-33ab-4d4d-8373-b4db36681366",
   "metadata": {},
   "source": [
    "## Backtranslation for Data Augmentation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "0bef5a6b-bf9e-4b41-aace-d0d98ac42ce4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Author: Sebastian Raschka\n",
      "\n",
      "Python implementation: CPython\n",
      "Python version       : 3.10.6\n",
      "IPython version      : 8.12.0\n",
      "\n",
      "transformers: 4.27.2\n",
      "\n"
     ]
    }
   ],
   "source": [
    "%load_ext watermark\n",
    "%watermark -a 'Sebastian Raschka' -v -p transformers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "01a56e71-a68d-4c99-ba42-d32b6f626621",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import MarianMTModel, MarianTokenizer\n",
    "\n",
    "# Tokenizers/models already loaded in this session, keyed by checkpoint name,\n",
    "# so repeated back_translate() calls do not reload them every time.\n",
    "_MARIAN_CACHE = {}\n",
    "\n",
    "\n",
    "def _load_marian(model_name):\n",
    "    \"\"\"Return the (tokenizer, model) pair for `model_name`, loading it only once.\"\"\"\n",
    "    if model_name not in _MARIAN_CACHE:\n",
    "        _MARIAN_CACHE[model_name] = (\n",
    "            MarianTokenizer.from_pretrained(model_name),\n",
    "            MarianMTModel.from_pretrained(model_name),\n",
    "        )\n",
    "    return _MARIAN_CACHE[model_name]\n",
    "\n",
    "\n",
    "def _translate(text, model_name):\n",
    "    \"\"\"Translate a single string with the MarianMT checkpoint `model_name`.\"\"\"\n",
    "    tokenizer, model = _load_marian(model_name)\n",
    "    inputs = tokenizer([text], return_tensors=\"pt\")\n",
    "    generated_tokens = model.generate(**inputs)\n",
    "    # A batch of one sentence was passed in, so only the first sequence is decoded.\n",
    "    return tokenizer.decode(generated_tokens[0], skip_special_tokens=True)\n",
    "\n",
    "\n",
    "def back_translate(text, src_lang=\"en\", pivot_lang=\"de\"):\n",
    "    \"\"\"Translate `text` into a pivot language and back again.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    text : str\n",
    "        Sentence to paraphrase.\n",
    "    src_lang, pivot_lang : str\n",
    "        Language codes used to build the ``Helsinki-NLP/opus-mt-<a>-<b>``\n",
    "        checkpoint names. The defaults give the English -> German -> English\n",
    "        round trip; other pairs only work if checkpoints for both directions\n",
    "        exist under that naming scheme.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple of (str, str)\n",
    "        The pivot-language translation and the back-translated text.\n",
    "    \"\"\"\n",
    "    forward_model_name = f\"Helsinki-NLP/opus-mt-{src_lang}-{pivot_lang}\"\n",
    "    backward_model_name = f\"Helsinki-NLP/opus-mt-{pivot_lang}-{src_lang}\"\n",
    "\n",
    "    pivot_text = _translate(text, forward_model_name)\n",
    "    round_trip_text = _translate(pivot_text, backward_model_name)\n",
    "\n",
    "    return pivot_text, round_trip_text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "d1807b7e-88da-41d4-9d75-27bbfc14278d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Original text:\n",
      "Despite the intermittent rain showers, Amelia decided to venture outside with her new umbrella, hoping to enjoy the fresh air and perhaps bump into some old friends at the local café down the street.\n",
      "--------------------------\n",
      "Translated text:\n",
      "Trotz der periodischen Regenschauer entschied sich Amelia, sich mit ihrem neuen Regenschirm nach draußen zu wagen, in der Hoffnung, die frische Luft zu genießen und vielleicht einige alte Freunde im örtlichen Café auf der Straße zu treffen.\n",
      "--------------------------\n",
      "Backtranslated text:\n",
      "Despite the periodic rain showers, Amelia decided to venture outside with her new umbrella, hoping to enjoy the fresh air and perhaps meet some old friends in the local café on the street.\n",
      "--------------------------\n"
     ]
    }
   ],
   "source": [
    "# Example sentence to paraphrase via back_translate().\n",
    "text = (\n",
    "    \"Despite the intermittent rain showers, \"\n",
    "    \"Amelia decided to venture outside with \"\n",
    "    \"her new umbrella, hoping to enjoy the fresh \"\n",
    "    \"air and perhaps bump into some old friends \"\n",
    "    \"at the local café down the street.\"\n",
    ")\n",
    "\n",
    "translated_text, back_translated_text = back_translate(text)\n",
    "\n",
    "# One print call, newline-separated: each heading, its text, then a divider.\n",
    "print(\n",
    "    \"Original text:\", text, \"--------------------------\",\n",
    "    \"Translated text:\", translated_text, \"--------------------------\",\n",
    "    \"Backtranslated text:\", back_translated_text, \"--------------------------\",\n",
    "    sep=\"\\n\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "370ccc24-102a-4df5-a1b3-04448e2e6669",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  Despite\n",
      "  the\n",
      "- intermittent\n",
      "+ periodic\n",
      "  rain\n",
      "  showers,\n",
      "  Amelia\n",
      "  decided\n",
      "  to\n",
      "  venture\n",
      "  outside\n",
      "  with\n",
      "  her\n",
      "  new\n",
      "  umbrella,\n",
      "  hoping\n",
      "  to\n",
      "  enjoy\n",
      "  the\n",
      "  fresh\n",
      "  air\n",
      "  and\n",
      "  perhaps\n",
      "+ meet\n",
      "- bump\n",
      "- into\n",
      "  some\n",
      "  old\n",
      "  friends\n",
      "- at\n",
      "+ in\n",
      "  the\n",
      "  local\n",
      "  café\n",
      "- down\n",
      "+ on\n",
      "  the\n",
      "  street.\n"
     ]
    }
   ],
   "source": [
    "import difflib\n",
    "\n",
    "\n",
    "# Word-level comparison: '-' marks words found only in the original sentence,\n",
    "# '+' marks words found only in the back-translated one.\n",
    "differ = difflib.Differ()\n",
    "diff = differ.compare(text.split(), back_translated_text.split())\n",
    "\n",
    "print(*diff, sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "59e92098-cd36-4878-b371-c3296b467bf0",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
